\newcommand{\Var}{\textnormal{Var}} 
\newcommand{\exercise}[4]{\handout{#1}{#2}{From:
#3}{Assignment #4}}
\documentclass[12pt]{article}
\usepackage{listings}
\usepackage{color}
\usepackage{cse522}
\usepackage{graphicx}
\usepackage{amssymb}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{pstricks,pst-node,pst-tree}

\parskip 0.12in

\begin{document}

Let $\Pr(d = 1) = 0.8$ be the prior information, $o_{1:t}$ be the observations made up to time $t$, $e_o \in \{-1,+1\}$ be a single observation, $c \in [0,1]$ be the coherence, and $d \in \{+1, -1\}$ be the hidden direction. Let $R(d,a)$ be the intermediate reward function over the hidden state $d$ and $r(\alpha,\beta,a)$ be the reward defined in the belief space.


Let $A_R = 1$, $A_L = -1$, and $A_S = 0$. When there is no prior information, we have $\pi(\alpha,\beta) = -\pi(\beta,\alpha)$, since the reward function is anti-symmetric: $r(\alpha,\beta,A_R) = r(\beta,\alpha,A_L)$. However, when prior information is available, this anti-symmetry no longer holds. Figure~\ref{fig:bestfit} shows the policy that best fits the experimental data, found by MCMC search ($100$ runs of $2000$ iterations each). The boundaries between the different actions are asymmetric: action $A_R$ is preferred given the same $\alpha+\beta$ (i.e., the same time), and in the later stage of a trial the optimal action is still $A_R$ even at states with $\beta > \alpha$.
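
As a rough illustration of the search procedure, the following is a minimal sketch of a random-walk Metropolis search over the model parameters; the parameter vector \texttt{theta}, the stand-in objective \texttt{fit\_error}, and the step size are hypothetical placeholders, not the actual objective or parameterization used to produce Figure~\ref{fig:bestfit}.

\begin{lstlisting}[language=Python]
import numpy as np

def fit_error(theta):
    # Stand-in objective so that the sketch runs; in the actual fit this would
    # be the discrepancy between the model's predictions and the data.
    return float(np.sum((np.asarray(theta) - np.array([0.8, 0.2, 0.5])) ** 2))

def mcmc_search(theta0, n_iter=2000, step=0.05, temp=1.0, rng=None):
    """Random-walk Metropolis search minimizing fit_error(theta)."""
    rng = np.random.default_rng() if rng is None else rng
    theta = np.asarray(theta0, dtype=float)
    err = fit_error(theta)
    best_theta, best_err = theta.copy(), err
    for _ in range(n_iter):
        prop = theta + step * rng.standard_normal(theta.shape)
        prop_err = fit_error(prop)
        # Always accept improvements; accept worse proposals with
        # probability exp(-(increase in error) / temp).
        if prop_err < err or rng.random() < np.exp((err - prop_err) / temp):
            theta, err = prop, prop_err
            if err < best_err:
                best_theta, best_err = theta.copy(), err
    return best_theta, best_err

# Keep the best result over 100 independent runs of 2000 iterations each.
rng = np.random.default_rng(0)
runs = [mcmc_search(rng.uniform(size=3), rng=rng) for _ in range(100)]
print(min(runs, key=lambda run: run[1]))
\end{lstlisting}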

\begin{figure}[h]
  \centering
  \includegraphics[scale = 0.08]{best_fit_policy.jpg}
  \includegraphics[scale = 0.3]{best_fit_prior.jpg}
  \caption{Policy that fits the experimental data the best.}
  \label{fig:bestfit}
\end{figure}

This indicates that the intermediate reward function is no longer anti-symmetric. By definition, the reward function $r(\alpha,\beta,a)$ is the expected reward given the belief state $(\alpha,\beta)$:
\begin{align*}
  r(\alpha,\beta,a) &= \E[ R(c,d,a) \mid \alpha,\beta ]  \\
&\approx \frac{1}{|\mathcal{I}|} \sum_{i \in \mathcal{I}} R(c_i, d_i, a) \quad \mbox{$\mathcal{I}$: previous trials that emit the observations $(\alpha=m_R,\beta=m_L)$}
\end{align*}

To compute this expectation, the agent might use Monte Carlo sampling, with each sample being the reward experienced in a previous trial. When the hidden variables in previous trials are drawn from a uniform distribution,
\begin{align*}
r(\alpha,\beta,a)  &= \int_0^1 \sum_d R(c,d,a) \Pr(c,d \mid \alpha,\beta) \, dc \\
 &= [1 - I_x(0.5 \mid \alpha,\beta)]\, R(d=1,a) + I_x(0.5 \mid \alpha,\beta)\, R(d=-1,a)
\end{align*}
where $I_x(0.5 \mid \alpha,\beta)$ is the CDF of the Beta$(\alpha,\beta)$ distribution evaluated at $x = 0.5$. When $\Pr(d = 1) = 0.8$, $80\%$ of the previous trials have hidden variable $d=1$. Assuming $c$ is still drawn from a uniform distribution, we thus have:
\begin{align*}
  r(\alpha,\beta,a) = \frac{[1 - I_x(0.5 \mid \alpha,\beta)]\, R(d=1,a) \Pr(d = 1) + I_x(0.5 \mid \alpha,\beta)\, R(d=-1,a) \Pr(d = -1)}{[1- I_x(0.5 \mid \alpha,\beta)]\Pr(d=1) + I_x(0.5 \mid \alpha,\beta)\Pr(d=-1)}
\end{align*}
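
The expressions above can be evaluated directly. The following is a minimal numerical sketch, assuming \texttt{scipy} is available; the reward values \texttt{R\_CORRECT} and \texttt{R\_WRONG} are placeholders for $R(d,a)$, not the values used in the experiment.

\begin{lstlisting}[language=Python]
from scipy.stats import beta

R_CORRECT, R_WRONG = 1.0, 0.0   # placeholder values for R(d, a)

def R(d, act):
    """Intermediate reward R(d, a) for the terminal actions a in {+1, -1}."""
    return R_CORRECT if act == d else R_WRONG

def r_uniform(a_, b_, act):
    """r(alpha, beta, a) when d was drawn from a uniform prior."""
    I = beta.cdf(0.5, a_, b_)               # I_x(0.5 | alpha, beta)
    return (1 - I) * R(+1, act) + I * R(-1, act)

def r_biased(a_, b_, act, p1=0.8):
    """r(alpha, beta, a) when Pr(d = 1) = p1 (biased prior)."""
    I = beta.cdf(0.5, a_, b_)
    num = (1 - I) * R(+1, act) * p1 + I * R(-1, act) * (1 - p1)
    den = (1 - I) * p1 + I * (1 - p1)
    return num / den

# The biased prior shifts the expected reward in favor of A_R = +1:
print(r_uniform(3, 4, +1), r_biased(3, 4, +1))
\end{lstlisting}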

With this reward function, we can recompute the optimal policy, which is shown in Figure~\ref{fig:policy_biased}. The resulting optimal policy is similar to the ``best-fit'' policy.
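
For completeness, here is a minimal backward-induction sketch of how a policy over the $(\alpha,\beta)$ lattice could be recomputed from this reward function; the horizon \texttt{T}, the per-sample cost, and the assumed transition probability $\alpha/(\alpha+\beta)$ for the next rightward observation are illustrative assumptions, not the dynamics or parameters used to produce Figure~\ref{fig:policy_biased}.

\begin{lstlisting}[language=Python]
from scipy.stats import beta

def optimal_policy(T=60, sample_cost=0.01, p1=0.8, R_correct=1.0, R_wrong=0.0):
    """Backward induction over belief states (alpha, beta) with alpha + beta <= T.
    Actions: +1 (A_R), -1 (A_L), 0 (A_S: take one more sample)."""

    def r(a_, b_, act):
        # Biased-prior reward from the expression above.
        I = beta.cdf(0.5, a_, b_)
        num = (1 - I) * (R_correct if act == +1 else R_wrong) * p1 \
            + I * (R_correct if act == -1 else R_wrong) * (1 - p1)
        return num / ((1 - I) * p1 + I * (1 - p1))

    V, pi = {}, {}
    for t in range(T, 1, -1):                    # t = alpha + beta
        for a_ in range(1, t):
            b_ = t - a_
            q = {+1: r(a_, b_, +1), -1: r(a_, b_, -1)}
            if t < T:
                # Assumption: the next observation is rightward with the
                # posterior-predictive probability alpha / (alpha + beta).
                p_right = a_ / (a_ + b_)
                q[0] = -sample_cost + p_right * V[(a_ + 1, b_)] \
                       + (1 - p_right) * V[(a_, b_ + 1)]
            pi[(a_, b_)] = max(q, key=q.get)
            V[(a_, b_)] = q[pi[(a_, b_)]]
    return pi, V

pi, V = optimal_policy()
print(pi[(2, 3)], V[(2, 3)])   # action and value at belief state (alpha=2, beta=3)
\end{lstlisting}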

\begin{figure}[h]
  \centering
  \includegraphics[scale=0.1]{policy_biased.jpg}
  \caption{Optimal policy with the biased prior $\Pr(d = 1) = 0.8$.}
  \label{fig:policy_biased}
\end{figure}

Let $R_P = 450$ and $R_N = 0$. We fit the psychometric function to the experimental data (human subjects) when the prior is neutral, $\Pr(d = 1) = 0.5$. With the same set of parameters, we then compute the psychometric and chronometric functions when the prior is biased, $\Pr(d = 1) = 0.8$. As shown in Figure~\ref{fig:prior} (left), the model predictions are quantitatively consistent with the experimental data. The comparison of the same predictions to the monkey data is shown in the right panel.

\begin{figure}[h]
  \centering
  \includegraphics[scale=0.1]{prior_Lsp.jpg}
  \includegraphics[scale=0.1]{prior_monk.jpg}
  \caption{Model predictions: green curves, biased prior; blue curves, neutral prior.}
  \label{fig:prior}
\end{figure}

\end{document}